# import functions
import pandas as pd
# Load the feature-enriched dataset; 'Zeitstempel' is parsed as a datetime column.
data_enriched = pd.read_csv("data_enriched.csv", parse_dates=['Zeitstempel'])

# Show the time span the data covers.
print(data_enriched['Zeitstempel'].min())
print(data_enriched['Zeitstempel'].max())

# Drop the article number, the timestamp and the log-transformed quantity
# before modelling (only the raw 'Menge' target and the features remain).
data_enriched = data_enriched.drop(columns=['ArtikelNr', 'Zeitstempel', 'Menge_log'])
2018-01-02 00:00:00 2022-02-22 00:00:00
# All distinct articles, ordered from most to least frequent.
all_articles = list(data_enriched['Artikel'].value_counts().index)
all_articles
['Laugen-Gipfel of', 'Butter-Gipfel / Croissant of', 'caffè crema', 'St. Galler Handbürli Culinarium *Gold prämiert*', 'Körnergipfel of', 'Weggli', 'Mais-Gipfel of', 'Sonnenblumenbrötli of', 'St. Galler Handb dk. Culinarium *Gold prämiert*', 'Semmel of', 'Nuss-Stengel Original', 'Pain Roule rustico klein of', 'Gallusbrot 400 of', 'Mais-Brötli süss of', 'Berliner m Confi Himbeer of', 'Erdbeertörtli gross ', 'Butterzopf 440 2-teilig ', 'Ziger-Krapfen of']
# Funktionen importieren
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# Function to calculate the score
def calculate_score(result):
    """Combine R2, MAPE and RMSE into one comparable score (higher is better).

    ``result`` must contain the keys 'R2', 'MAPE' and 'RMSE'; 'RMSE' must be
    non-zero. The score rewards high R2 and low MAPE/RMSE.
    """
    accuracy = 1 - result['MAPE']   # 1 - MAPE: closer to 1 means smaller relative error
    inv_rmse = 1 / result['RMSE']   # larger when the absolute error is smaller
    return (0.6 * result['R2'] * accuracy * inv_rmse
            + 0.2 * accuracy * inv_rmse
            + 0.2 * result['R2'])
# Best model (and its metrics) per article, keyed by article name.
best_models = {}
# Accumulates one metrics row per (article, model) combination.
all_results = pd.DataFrame()

# Random-search space for the random forest.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 50],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
    bootstrap=[True, False],
)
# Random-search space for XGBoost.
xgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 50],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.5, 1.0],
    colsample_bytree=[0.4, 1.0],
    min_child_weight=[1, 4],
)
# Per-article model selection: fit a linear baseline plus two tuned tree
# ensembles, evaluate each on a held-out split, and keep the lowest-RMSE model.
for artikel in all_articles:
    # Restrict to this article's rows; the article name itself is no longer a feature.
    data_filtered = data_enriched[data_enriched['Artikel'] == artikel].drop(['Artikel'], axis=1)
    X = data_filtered.drop(['Menge'], axis=1)
    y = data_filtered['Menge']
    # Hold out 25% of the rows for evaluation (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4153)
    models = [
        ('LinearRegression', LinearRegression(), {}),
        ('RandomForestRegressor', RandomForestRegressor(), rf_params),
        ('XGBRegressor', XGBRegressor(), xgb_params)
    ]
    # Track the lowest test RMSE seen so far for this article.
    best_rmse = float('inf')
    best_model_info = None
    # Collect result rows and concatenate once per article — repeated
    # pd.concat inside the inner loop grows quadratically with the row count.
    result_rows = []
    for name, model, params in models:
        if params:
            # Randomized hyperparameter search with 5-fold cross-validation.
            model_cv = RandomizedSearchCV(model, params, cv=5, n_iter=10, random_state=4153)
            model_cv.fit(X_train, y_train)
            best_model = model_cv.best_estimator_
        else:
            # No search space: fit the model as-is.
            best_model = model
            best_model.fit(X_train, y_train)
        # Evaluate on the held-out test split.
        y_pred = best_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # NOTE(review): the 'squared' kwarg was removed in scikit-learn 1.6;
        # switch to root_mean_squared_error when upgrading.
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': name}
        result['score'] = calculate_score(result)
        result_rows.append(result)
        # Keep whichever model has the lowest test RMSE.
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_info = {'model': best_model, 'metrics': result}
    # Append this article's rows to the global results in one concat.
    all_results = pd.concat([all_results, pd.DataFrame(result_rows)], ignore_index=True)
    # Save the best model for this artikel.
    best_models[artikel] = best_model_info
import pickle

# Persist the per-article best models so later steps can reload them.
with open('best_models.pkl', 'wb') as file_out:
    pickle.dump(best_models, file_out)
import matplotlib.pyplot as plt
import seaborn as sns

# One grouped bar chart per evaluation metric, comparing models per article.
for metric, title, ylabel in (
    ('score', 'Scores for each Artikel', 'Score'),
    ('R2', 'R2 for each Artikel', 'R2'),
    ('RMSE', 'RMSE for each Artikel', 'RMSE'),
):
    plt.figure(figsize=(10, 5))
    sns.barplot(data=all_results, x='Artikel', y=metric, hue='modelname')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Artikel')
    plt.xticks(rotation=90)
    plt.legend(title='Model')
    plt.show()
# Quick textual review: each article's winning model and its metrics.
for artikel, model_info in best_models.items():
    print(artikel)
    print(model_info)
Laugen-Gipfel of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10,
n_estimators=300), 'metrics': {'Artikel': 'Laugen-Gipfel of', 'R2': 0.6627369296333037, 'RMSE': 4.87800554001229, 'MAPE': 0.2148137293918103, 'modelname': 'RandomForestRegressor', 'score': 0.22874662415555236}}
Butter-Gipfel / Croissant of
{'model': RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Butter-Gipfel / Croissant of', 'R2': 0.7066574089100415, 'RMSE': 17.982623591987302, 'MAPE': 0.20390633584918344, 'modelname': 'RandomForestRegressor', 'score': 0.16895581712240984}}
caffè crema
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, n_estimators=300), 'metrics': {'Artikel': 'caffè crema', 'R2': 0.6634038050016473, 'RMSE': 10.931616226414894, 'MAPE': 0.25385630688431504, 'modelname': 'RandomForestRegressor', 'score': 0.17350047772249583}}
St. Galler Handbürli Culinarium *Gold prämiert*
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10,
n_estimators=300), 'metrics': {'Artikel': 'St. Galler Handbürli Culinarium *Gold prämiert*', 'R2': 0.21921916977812095, 'RMSE': 7.938905773111951, 'MAPE': 0.48110396370560776, 'modelname': 'RandomForestRegressor', 'score': 0.06551311523215625}}
Körnergipfel of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Körnergipfel of', 'R2': 0.6140729229946368, 'RMSE': 3.8832691832362958, 'MAPE': 0.30771712266985146, 'modelname': 'RandomForestRegressor', 'score': 0.22415287945049006}}
Weggli
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, n_estimators=200), 'metrics': {'Artikel': 'Weggli', 'R2': 0.5221181270101496, 'RMSE': 3.3357022391004483, 'MAPE': 0.2205785840529178, 'modelname': 'RandomForestRegressor', 'score': 0.22435468771556372}}
Mais-Gipfel of
{'model': RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Mais-Gipfel of', 'R2': 0.5863327206561324, 'RMSE': 3.541304074415829, 'MAPE': 0.4789148935736349, 'modelname': 'RandomForestRegressor', 'score': 0.19846108832447573}}
Sonnenblumenbrötli of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, n_estimators=200), 'metrics': {'Artikel': 'Sonnenblumenbrötli of', 'R2': 0.3699374119375518, 'RMSE': 2.5375262070750257, 'MAPE': 0.48600412442625485, 'modelname': 'RandomForestRegressor', 'score': 0.15945929224855931}}
St. Galler Handb dk. Culinarium *Gold prämiert*
{'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.4, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=10, max_leaves=None,
min_child_weight=4, missing=nan, monotone_constraints=None,
n_estimators=300, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...), 'metrics': {'Artikel': 'St. Galler Handb dk. Culinarium *Gold prämiert*', 'R2': 0.22528609136472566, 'RMSE': 6.286680467429014, 'MAPE': 0.5155542127318328, 'modelname': 'XGBRegressor', 'score': 0.07088523625953266}}
Semmel of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10,
n_estimators=300), 'metrics': {'Artikel': 'Semmel of', 'R2': 0.34121630754748355, 'RMSE': 3.2123947804646136, 'MAPE': 0.42367614697763484, 'modelname': 'RandomForestRegressor', 'score': 0.1408543335579437}}
Nuss-Stengel Original
{'model': LinearRegression(), 'metrics': {'Artikel': 'Nuss-Stengel Original', 'R2': -0.0021114790147238605, 'RMSE': 2.915950677427657, 'MAPE': 0.7041052864028683, 'modelname': 'LinearRegression', 'score': 0.019744052649939302}}
Pain Roule rustico klein of
{'model': RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Pain Roule rustico klein of', 'R2': 0.8154665271885964, 'RMSE': 4.413756384641824, 'MAPE': 0.4852917215344433, 'modelname': 'RandomForestRegressor', 'score': 0.2434733826755387}}
Gallusbrot 400 of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Gallusbrot 400 of', 'R2': 0.3651120092669522, 'RMSE': 3.268461516816381, 'MAPE': 0.30427740262421493, 'modelname': 'RandomForestRegressor', 'score': 0.16222477527197693}}
Mais-Brötli süss of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, n_estimators=200), 'metrics': {'Artikel': 'Mais-Brötli süss of', 'R2': 0.33460189322392575, 'RMSE': 2.017214018376093, 'MAPE': 0.25776153039382643, 'modelname': 'RandomForestRegressor', 'score': 0.21438134684339533}}
Berliner m Confi Himbeer of
{'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=1.0, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=50, max_leaves=None,
min_child_weight=4, missing=nan, monotone_constraints=None,
n_estimators=300, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...), 'metrics': {'Artikel': 'Berliner m Confi Himbeer of', 'R2': 0.316613154674098, 'RMSE': 8.212453649924912, 'MAPE': 0.5094356651347668, 'modelname': 'XGBRegressor', 'score': 0.08661705037475946}}
Erdbeertörtli gross
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Erdbeertörtli gross ', 'R2': 0.33017165577975394, 'RMSE': 2.518308573781984, 'MAPE': 0.6012645844655415, 'modelname': 'RandomForestRegressor', 'score': 0.12906781489123598}}
Butterzopf 440 2-teilig
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Butterzopf 440 2-teilig ', 'R2': 0.7411862643597246, 'RMSE': 2.6896141238490627, 'MAPE': 0.39365310636856266, 'modelname': 'RandomForestRegressor', 'score': 0.2935811401820453}}
Ziger-Krapfen of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10,
n_estimators=300), 'metrics': {'Artikel': 'Ziger-Krapfen of', 'R2': 0.22389419263674926, 'RMSE': 8.802969741503238, 'MAPE': 0.4924556679328408, 'modelname': 'RandomForestRegressor', 'score': 0.06405535639230567}}
# Plot feature importances/coefficients for each article's best model
# (helper lives in the project-local 'functions' module).
from functions import plot_variable_importance
for artikel, models in best_models.items():
model = models['model']
model_name = models['metrics']['modelname']
# NOTE(review): X_train here is the split left over from the LAST iteration of
# the training loop, not this article's own split. The feature columns are the
# same for every article, but confirm plot_variable_importance only uses the
# column names, not the values.
plot_variable_importance(model, X_train, model_name, artikel)
from sklearn.ensemble import VotingRegressor

# Re-run per-article model selection, this time also fitting a VotingRegressor
# ensemble over the three (tuned) base models and letting it compete on RMSE.
best_models_voting = {}
# Accumulates one metrics row per (article, model) combination, incl. the ensemble.
all_results_voting = pd.DataFrame()

for artikel in all_articles:
    # Restrict to this article's rows; the article name itself is no longer a feature.
    data_filtered = data_enriched[data_enriched['Artikel'] == artikel].drop(['Artikel'], axis=1)
    X = data_filtered.drop(['Menge'], axis=1)
    y = data_filtered['Menge']
    # Same split seed as the first experiment so results stay comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4153)
    models = [
        ('LinearRegression', LinearRegression(), {}),
        ('RandomForestRegressor', RandomForestRegressor(), rf_params),
        ('XGBRegressor', XGBRegressor(), xgb_params)
    ]
    # Track the lowest test RMSE seen so far for this article.
    best_rmse = float('inf')
    best_model_info = None
    # Base estimators for the VotingRegressor.
    estimators = []
    # Collect result rows and concatenate once per article — repeated
    # pd.concat inside the inner loop grows quadratically with the row count.
    result_rows = []
    for name, model, params in models:
        if params:
            # Randomized hyperparameter search with 5-fold cross-validation.
            model_cv = RandomizedSearchCV(model, params, cv=5, n_iter=10, random_state=4153)
            model_cv.fit(X_train, y_train)
            best_model = model_cv.best_estimator_
        else:
            # No search space: fit the model as-is.
            best_model = model
            best_model.fit(X_train, y_train)
        # Add the fitted model to the ensemble's estimator list.
        estimators.append((name, best_model))
        # Evaluate on the held-out test split.
        y_pred = best_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # NOTE(review): the 'squared' kwarg was removed in scikit-learn 1.6;
        # switch to root_mean_squared_error when upgrading.
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': name}
        result['score'] = calculate_score(result)
        result_rows.append(result)
        # Keep whichever model has the lowest test RMSE.
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_info = {'model': best_model, 'metrics': result}
    # Ensemble of the tuned base models (VotingRegressor refits clones internally).
    voting_regressor = VotingRegressor(estimators)
    voting_regressor.fit(X_train, y_train)
    y_pred = voting_regressor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': 'VotingRegressor'}
    result['score'] = calculate_score(result)
    result_rows.append(result)
    # The ensemble competes on the same criterion as the base models.
    if rmse < best_rmse:
        best_rmse = rmse
        best_model_info = {'model': voting_regressor, 'metrics': result}
    # Append this article's rows to the global results in one concat.
    all_results_voting = pd.concat([all_results_voting, pd.DataFrame(result_rows)], ignore_index=True)
    # Save the best model and metrics for this artikel.
    best_models_voting[artikel] = best_model_info
import pickle

# Persist the per-article best models (now possibly voting ensembles).
with open('best_models_voting.pkl', 'wb') as file_out:
    pickle.dump(best_models_voting, file_out)
import matplotlib.pyplot as plt
import seaborn as sns

# One grouped bar chart per evaluation metric for the voting experiment.
# Fix: the score plot previously labelled its y-axis 'R2'; it now says 'Score'.
for metric, title, ylabel in (
    ('score', 'Scores for each Artikel', 'Score'),
    ('R2', 'R2 for each Artikel', 'R2'),
    ('RMSE', 'RMSE for each Artikel', 'RMSE'),
):
    plt.figure(figsize=(10, 5))
    sns.barplot(data=all_results_voting, x='Artikel', y=metric, hue='modelname')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Artikel')
    plt.xticks(rotation=90)
    plt.legend(title='Model')
    plt.show()
# Plot feature importances/coefficients for each article's best voting-round
# model (helper lives in the project-local 'functions' module). The residue
# output below shows it cannot extract importances from VotingRegressor winners.
from functions import plot_variable_importance
for artikel, models in best_models_voting.items():
model = models['model']
model_name = models['metrics']['modelname']
# NOTE(review): X_train is the split left over from the LAST loop iteration,
# not this article's own split — confirm only the column names are used.
plot_variable_importance(model, X_train, model_name, artikel)
No feature importances or coefficients available for model Estimator 0
No feature importances or coefficients available for model Estimator 0
No feature importances or coefficients available for model Estimator 0